from warnings import filterwarnings
filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the training data; the path is resolved relative to the working directory.
df = pd.read_csv('training_set.csv')
# Preview the first five rows.
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
df.shape
(1460, 81)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 588 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null 
object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
df.describe()
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | ... | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
| mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | ... | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
| std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | ... | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
| min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
| 25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
| 50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | ... | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
| 75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | ... | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
| max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | ... | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
8 rows × 38 columns
df.columns
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice'],
dtype='object')
# Count missing values per column, then display only the columns that have any.
s = df.isnull().sum()
s.loc[s.gt(0)]
LotFrontage 259 Alley 1369 MasVnrType 872 MasVnrArea 8 BsmtQual 37 BsmtCond 37 BsmtExposure 38 BsmtFinType1 37 BsmtFinType2 38 Electrical 1 FireplaceQu 690 GarageType 81 GarageYrBlt 81 GarageFinish 81 GarageQual 81 GarageCond 81 PoolQC 1453 Fence 1179 MiscFeature 1406 dtype: int64
# 'Id' runs 1..1460 over the 1460 rows (see the describe() output), so it looks
# like a plain row identifier rather than a predictor and is removed.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because 'Id' has already been dropped.
df = df.drop(columns=['Id'], errors='ignore')
df.head()
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 80 columns
# Partition the column names by dtype: object-typed columns are treated as
# categorical, everything else as continuous. At this point `con` still
# contains the SalePrice target.
cat = [name for name, kind in df.dtypes.items() if kind == 'object']
con = [name for name, kind in df.dtypes.items() if kind != 'object']
cat
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
con
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
# Univariate view of every categorical column: bar chart of level frequencies.
# value_counts() skips NaN by default, so missing entries are not shown as a bar.
for col in cat:
    df[col].value_counts().plot(
        kind='bar',
        title=f'Countplot for {col}',
        figsize=(16, 8),
    )
    plt.show()

# Univariate view of every numeric column: histogram with a KDE overlay.
for col in con:
    sns.histplot(data=df, x=col, kde=True)
    plt.title(f'Histogram for {col}')
    plt.show()

# Bivariate view: each numeric feature against the target.
for col in con:
    # The target plotted against itself carries no information.
    if col == 'SalePrice':
        continue
    plt.figure(figsize=(16, 8))
    sns.scatterplot(data=df, x=col, y='SalePrice')
    plt.title(f'Scatterplot for {col} vs SalePrice')
    plt.show()

# Bivariate view: distribution of the target within each categorical level.
for col in cat:
    plt.figure(figsize=(16, 8))
    sns.boxplot(data=df, x=col, y='SalePrice')
    plt.title(f'Boxplot for {col} vs SalePrice')
    plt.show()
cat
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
# Contingency table of exterior quality (rows) against exterior condition (columns).
ctab1 = pd.crosstab(index=df['ExterQual'], columns=df['ExterCond'])
ctab1
| ExterCond | Ex | Fa | Gd | Po | TA |
|---|---|---|---|---|---|
| ExterQual | |||||
| Ex | 1 | 0 | 3 | 0 | 48 |
| Fa | 0 | 6 | 0 | 0 | 8 |
| Gd | 0 | 0 | 35 | 0 | 453 |
| TA | 2 | 22 | 108 | 1 | 773 |
# Annotated heatmap of the contingency table; fmt='d' prints the counts as integers.
sns.heatmap(data=ctab1, annot=True, fmt='d')
<Axes: xlabel='ExterCond', ylabel='ExterQual'>
# Separate predictors from the target. `y` is kept as a one-column DataFrame
# (note the list selector), not a Series.
x = df.drop('SalePrice', axis=1)
y = df.loc[:, ['SalePrice']]
x.head()
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal |
| 1 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | ... | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal |
| 2 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | ... | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal |
| 3 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | ... | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml |
| 4 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | ... | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal |
5 rows × 79 columns
y.head()
| SalePrice | |
|---|---|
| 0 | 208500 |
| 1 | 181500 |
| 2 | 223500 |
| 3 | 140000 |
| 4 | 250000 |
# Recompute the categorical / continuous column lists on the predictor frame so
# that the SalePrice target is no longer part of `con`.
# The predictor frame is named `x` (lowercase); the previous `X.columns` lookup
# referenced a name that is never defined and raised NameError.
cat = list(x.columns[x.dtypes == 'object'])
con = list(x.columns[x.dtypes != 'object'])
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
# Numeric branch: fill gaps with the column median, then standardise.
num_pipe1 = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: missing values become their own 'NotAvailable' level,
# then every level is mapped to an integer code.
cat_pipe1 = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='constant', fill_value='NotAvailable')),
        ('ordinal', OrdinalEncoder()),
    ]
)

# Route each column list through its branch; the transformer names ('num',
# 'cat') become the prefixes of the output feature names.
pre1 = ColumnTransformer(
    transformers=[
        ('num', num_pipe1, con),
        ('cat', cat_pipe1, cat),
    ]
)

X_pre = pre1.fit_transform(x)
X_pre
array([[ 0.07337496, -0.22087509, -0.20714171, ..., 1. ,
8. , 4. ],
[-0.87256276, 0.46031974, -0.09188637, ..., 1. ,
8. , 4. ],
[ 0.07337496, -0.08463612, 0.07347998, ..., 1. ,
8. , 4. ],
...,
[ 0.30985939, -0.1754621 , -0.14781027, ..., 3. ,
8. , 4. ],
[-0.87256276, -0.08463612, -0.08016039, ..., 1. ,
8. , 4. ],
[-0.87256276, 0.23325479, -0.05811155, ..., 1. ,
8. , 4. ]])
# Output feature names of the fitted transformer, prefixed with the branch name
# ('num__' / 'cat__').
cols = pre1.get_feature_names_out()
cols
array(['num__MSSubClass', 'num__LotFrontage', 'num__LotArea',
'num__OverallQual', 'num__OverallCond', 'num__YearBuilt',
'num__YearRemodAdd', 'num__MasVnrArea', 'num__BsmtFinSF1',
'num__BsmtFinSF2', 'num__BsmtUnfSF', 'num__TotalBsmtSF',
'num__1stFlrSF', 'num__2ndFlrSF', 'num__LowQualFinSF',
'num__GrLivArea', 'num__BsmtFullBath', 'num__BsmtHalfBath',
'num__FullBath', 'num__HalfBath', 'num__BedroomAbvGr',
'num__KitchenAbvGr', 'num__TotRmsAbvGrd', 'num__Fireplaces',
'num__GarageYrBlt', 'num__GarageCars', 'num__GarageArea',
'num__WoodDeckSF', 'num__OpenPorchSF', 'num__EnclosedPorch',
'num__3SsnPorch', 'num__ScreenPorch', 'num__PoolArea',
'num__MiscVal', 'num__MoSold', 'num__YrSold', 'cat__MSZoning',
'cat__Street', 'cat__Alley', 'cat__LotShape', 'cat__LandContour',
'cat__Utilities', 'cat__LotConfig', 'cat__LandSlope',
'cat__Neighborhood', 'cat__Condition1', 'cat__Condition2',
'cat__BldgType', 'cat__HouseStyle', 'cat__RoofStyle',
'cat__RoofMatl', 'cat__Exterior1st', 'cat__Exterior2nd',
'cat__MasVnrType', 'cat__ExterQual', 'cat__ExterCond',
'cat__Foundation', 'cat__BsmtQual', 'cat__BsmtCond',
'cat__BsmtExposure', 'cat__BsmtFinType1', 'cat__BsmtFinType2',
'cat__Heating', 'cat__HeatingQC', 'cat__CentralAir',
'cat__Electrical', 'cat__KitchenQual', 'cat__Functional',
'cat__FireplaceQu', 'cat__GarageType', 'cat__GarageFinish',
'cat__GarageQual', 'cat__GarageCond', 'cat__PavedDrive',
'cat__PoolQC', 'cat__Fence', 'cat__MiscFeature', 'cat__SaleType',
'cat__SaleCondition'], dtype=object)
# Wrap the transformed array in a DataFrame so the columns carry the feature names.
X_pre = pd.DataFrame(X_pre, columns=cols)
X_pre.head()
| num__MSSubClass | num__LotFrontage | num__LotArea | num__OverallQual | num__OverallCond | num__YearBuilt | num__YearRemodAdd | num__MasVnrArea | num__BsmtFinSF1 | num__BsmtFinSF2 | ... | cat__GarageType | cat__GarageFinish | cat__GarageQual | cat__GarageCond | cat__PavedDrive | cat__PoolQC | cat__Fence | cat__MiscFeature | cat__SaleType | cat__SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.073375 | -0.220875 | -0.207142 | 0.651479 | -0.517200 | 1.050994 | 0.878668 | 0.514104 | 0.575425 | -0.288653 | ... | 1.0 | 2.0 | 5.0 | 5.0 | 2.0 | 3.0 | 4.0 | 1.0 | 8.0 | 4.0 |
| 1 | -0.872563 | 0.460320 | -0.091886 | -0.071836 | 2.179628 | 0.156734 | -0.429577 | -0.570750 | 1.171992 | -0.288653 | ... | 1.0 | 2.0 | 5.0 | 5.0 | 2.0 | 3.0 | 4.0 | 1.0 | 8.0 | 4.0 |
| 2 | 0.073375 | -0.084636 | 0.073480 | 0.651479 | -0.517200 | 0.984752 | 0.830215 | 0.325915 | 0.092907 | -0.288653 | ... | 1.0 | 2.0 | 5.0 | 5.0 | 2.0 | 3.0 | 4.0 | 1.0 | 8.0 | 4.0 |
| 3 | 0.309859 | -0.447940 | -0.096897 | 0.651479 | -0.517200 | -1.863632 | -0.720298 | -0.570750 | -0.499274 | -0.288653 | ... | 5.0 | 3.0 | 5.0 | 5.0 | 2.0 | 3.0 | 4.0 | 1.0 | 8.0 | 0.0 |
| 4 | 0.073375 | 0.641972 | 0.375148 | 1.374795 | -0.517200 | 0.951632 | 0.733308 | 1.366489 | 0.463568 | -0.288653 | ... | 1.0 | 2.0 | 5.0 | 5.0 | 2.0 | 3.0 | 4.0 | 1.0 | 8.0 | 4.0 |
5 rows × 79 columns
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
# Backward sequential feature selection driven by a plain linear regression:
# start from all preprocessed features and repeatedly discard one.
# The number of features kept is left at the estimator's default.
lr = LinearRegression()
sel = SequentialFeatureSelector(lr, direction='backward')

# The target frame is named `y` (lowercase); the previous `Y` was never
# defined and raised NameError.
# NOTE(review): selection is fitted on all rows, before the train/test split
# further down, so the held-out rows influence which features are kept.
sel_features = sel.fit_transform(X_pre, y)

# Names of the retained features (still carrying the 'num__' / 'cat__' prefix).
sel_cols = sel.get_feature_names_out()
sel_cols
array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
'num__OverallCond', 'num__YearBuilt', 'num__MasVnrArea',
'num__BsmtFinSF1', 'num__BsmtFinSF2', 'num__BsmtUnfSF',
'num__TotalBsmtSF', 'num__1stFlrSF', 'num__2ndFlrSF',
'num__BsmtFullBath', 'num__KitchenAbvGr', 'num__TotRmsAbvGrd',
'num__Fireplaces', 'num__GarageCars', 'num__WoodDeckSF',
'num__EnclosedPorch', 'num__ScreenPorch', 'num__YrSold',
'cat__Alley', 'cat__LandContour', 'cat__Neighborhood',
'cat__BldgType', 'cat__HouseStyle', 'cat__RoofMatl',
'cat__MasVnrType', 'cat__ExterQual', 'cat__ExterCond',
'cat__BsmtQual', 'cat__BsmtCond', 'cat__BsmtExposure',
'cat__HeatingQC', 'cat__KitchenQual', 'cat__Functional',
'cat__GarageFinish', 'cat__GarageCond', 'cat__Fence',
'cat__SaleCondition'], dtype=object)
len(sel_cols)
40
# Map the selected feature names back to the original column names by removing
# the '<transformer>__' prefix. maxsplit=1 cuts at the first '__' only, so a
# column whose own name contains '__' is returned whole instead of truncated.
imp_cols = [name.split('__', 1)[1] for name in sel_cols]
imp_cols
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'EnclosedPorch', 'ScreenPorch', 'YrSold', 'Alley', 'LandContour', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'GarageFinish', 'GarageCond', 'Fence', 'SaleCondition']
# Keep only the selected columns of the raw (un-encoded) predictor frame.
# The predictor frame is named `x` (lowercase); `X` was never defined and
# raised NameError.
X_sel = x[imp_cols]
X_sel
| MSSubClass | LotArea | OverallQual | OverallCond | YearBuilt | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | ... | BsmtQual | BsmtCond | BsmtExposure | HeatingQC | KitchenQual | Functional | GarageFinish | GarageCond | Fence | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | 8450 | 7 | 5 | 2003 | 196.0 | 706 | 0 | 150 | 856 | ... | Gd | TA | No | Ex | Gd | Typ | RFn | TA | NaN | Normal |
| 1 | 20 | 9600 | 6 | 8 | 1976 | 0.0 | 978 | 0 | 284 | 1262 | ... | Gd | TA | Gd | Ex | TA | Typ | RFn | TA | NaN | Normal |
| 2 | 60 | 11250 | 7 | 5 | 2001 | 162.0 | 486 | 0 | 434 | 920 | ... | Gd | TA | Mn | Ex | Gd | Typ | RFn | TA | NaN | Normal |
| 3 | 70 | 9550 | 7 | 5 | 1915 | 0.0 | 216 | 0 | 540 | 756 | ... | TA | Gd | No | Gd | Gd | Typ | Unf | TA | NaN | Abnorml |
| 4 | 60 | 14260 | 8 | 5 | 2000 | 350.0 | 655 | 0 | 490 | 1145 | ... | Gd | TA | Av | Ex | Gd | Typ | RFn | TA | NaN | Normal |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | 60 | 7917 | 6 | 5 | 1999 | 0.0 | 0 | 0 | 953 | 953 | ... | Gd | TA | No | Ex | TA | Typ | RFn | TA | NaN | Normal |
| 1456 | 20 | 13175 | 6 | 6 | 1978 | 119.0 | 790 | 163 | 589 | 1542 | ... | Gd | TA | No | TA | TA | Min1 | Unf | TA | MnPrv | Normal |
| 1457 | 70 | 9042 | 7 | 9 | 1941 | 0.0 | 275 | 0 | 877 | 1152 | ... | TA | Gd | No | Ex | Gd | Typ | RFn | TA | GdPrv | Normal |
| 1458 | 20 | 9717 | 5 | 6 | 1950 | 0.0 | 49 | 1029 | 0 | 1078 | ... | TA | TA | Mn | Gd | Gd | Typ | Unf | TA | NaN | Normal |
| 1459 | 20 | 9937 | 5 | 6 | 1965 | 0.0 | 830 | 290 | 136 | 1256 | ... | TA | TA | No | Gd | TA | Typ | Fin | TA | NaN | Normal |
1460 rows × 40 columns
# Split the selected columns by dtype, as was done for the full frame:
# object-typed columns are categorical, the rest continuous.
cat_sel = [name for name, kind in X_sel.dtypes.items() if kind == 'object']
con_sel = [name for name, kind in X_sel.dtypes.items() if kind != 'object']
cat_sel
['Alley', 'LandContour', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'GarageFinish', 'GarageCond', 'Fence', 'SaleCondition']
con_sel
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'EnclosedPorch', 'ScreenPorch', 'YrSold']
from sklearn.preprocessing import OneHotEncoder
# Final preprocessing for the selected columns.
# Numeric branch: median imputation followed by standardisation.
num_pipe2 = Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                            ('scaler', StandardScaler())])

# Categorical branch: missing values become a 'NotAvailable' level, then the
# levels are one-hot encoded. handle_unknown='ignore' makes levels not seen
# during fit encode as all zeros instead of raising at transform time.
cat_pipe2 = Pipeline(steps=[('impute', SimpleImputer(strategy='constant', fill_value='NotAvailable')),
                            ('ohe', OneHotEncoder(handle_unknown='ignore'))])

pre2 = ColumnTransformer([('num', num_pipe2, con_sel),
                          ('cat', cat_pipe2, cat_sel)])

# ColumnTransformer returns a sparse matrix only when the stacked output is
# sparse enough, otherwise it already returns a dense ndarray. Densify only
# when needed so this cell does not fail with AttributeError on dense output.
X_sel_pre = pre2.fit_transform(X_sel)
if hasattr(X_sel_pre, 'toarray'):
    X_sel_pre = X_sel_pre.toarray()
X_sel_pre
array([[ 0.07337496, -0.20714171, 0.65147924, ..., 0. ,
1. , 0. ],
[-0.87256276, -0.09188637, -0.07183611, ..., 0. ,
1. , 0. ],
[ 0.07337496, 0.07347998, 0.65147924, ..., 0. ,
1. , 0. ],
...,
[ 0.30985939, -0.14781027, 0.65147924, ..., 0. ,
1. , 0. ],
[-0.87256276, -0.08016039, -0.79515147, ..., 0. ,
1. , 0. ],
[-0.87256276, -0.05811155, -0.79515147, ..., 0. ,
1. , 0. ]])
# Feature names after one-hot encoding: one 'cat__<column>_<level>' entry per
# categorical level, plus the 'num__' columns.
final_cols = pre2.get_feature_names_out()
final_cols
array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
'num__OverallCond', 'num__YearBuilt', 'num__MasVnrArea',
'num__BsmtFinSF1', 'num__BsmtFinSF2', 'num__BsmtUnfSF',
'num__TotalBsmtSF', 'num__1stFlrSF', 'num__2ndFlrSF',
'num__BsmtFullBath', 'num__KitchenAbvGr', 'num__TotRmsAbvGrd',
'num__Fireplaces', 'num__GarageCars', 'num__WoodDeckSF',
'num__EnclosedPorch', 'num__ScreenPorch', 'num__YrSold',
'cat__Alley_Grvl', 'cat__Alley_NotAvailable', 'cat__Alley_Pave',
'cat__LandContour_Bnk', 'cat__LandContour_HLS',
'cat__LandContour_Low', 'cat__LandContour_Lvl',
'cat__Neighborhood_Blmngtn', 'cat__Neighborhood_Blueste',
'cat__Neighborhood_BrDale', 'cat__Neighborhood_BrkSide',
'cat__Neighborhood_ClearCr', 'cat__Neighborhood_CollgCr',
'cat__Neighborhood_Crawfor', 'cat__Neighborhood_Edwards',
'cat__Neighborhood_Gilbert', 'cat__Neighborhood_IDOTRR',
'cat__Neighborhood_MeadowV', 'cat__Neighborhood_Mitchel',
'cat__Neighborhood_NAmes', 'cat__Neighborhood_NPkVill',
'cat__Neighborhood_NWAmes', 'cat__Neighborhood_NoRidge',
'cat__Neighborhood_NridgHt', 'cat__Neighborhood_OldTown',
'cat__Neighborhood_SWISU', 'cat__Neighborhood_Sawyer',
'cat__Neighborhood_SawyerW', 'cat__Neighborhood_Somerst',
'cat__Neighborhood_StoneBr', 'cat__Neighborhood_Timber',
'cat__Neighborhood_Veenker', 'cat__BldgType_1Fam',
'cat__BldgType_2fmCon', 'cat__BldgType_Duplex',
'cat__BldgType_Twnhs', 'cat__BldgType_TwnhsE',
'cat__HouseStyle_1.5Fin', 'cat__HouseStyle_1.5Unf',
'cat__HouseStyle_1Story', 'cat__HouseStyle_2.5Fin',
'cat__HouseStyle_2.5Unf', 'cat__HouseStyle_2Story',
'cat__HouseStyle_SFoyer', 'cat__HouseStyle_SLvl',
'cat__RoofMatl_ClyTile', 'cat__RoofMatl_CompShg',
'cat__RoofMatl_Membran', 'cat__RoofMatl_Metal',
'cat__RoofMatl_Roll', 'cat__RoofMatl_Tar&Grv',
'cat__RoofMatl_WdShake', 'cat__RoofMatl_WdShngl',
'cat__MasVnrType_BrkCmn', 'cat__MasVnrType_BrkFace',
'cat__MasVnrType_NotAvailable', 'cat__MasVnrType_Stone',
'cat__ExterQual_Ex', 'cat__ExterQual_Fa', 'cat__ExterQual_Gd',
'cat__ExterQual_TA', 'cat__ExterCond_Ex', 'cat__ExterCond_Fa',
'cat__ExterCond_Gd', 'cat__ExterCond_Po', 'cat__ExterCond_TA',
'cat__BsmtQual_Ex', 'cat__BsmtQual_Fa', 'cat__BsmtQual_Gd',
'cat__BsmtQual_NotAvailable', 'cat__BsmtQual_TA',
'cat__BsmtCond_Fa', 'cat__BsmtCond_Gd',
'cat__BsmtCond_NotAvailable', 'cat__BsmtCond_Po',
'cat__BsmtCond_TA', 'cat__BsmtExposure_Av', 'cat__BsmtExposure_Gd',
'cat__BsmtExposure_Mn', 'cat__BsmtExposure_No',
'cat__BsmtExposure_NotAvailable', 'cat__HeatingQC_Ex',
'cat__HeatingQC_Fa', 'cat__HeatingQC_Gd', 'cat__HeatingQC_Po',
'cat__HeatingQC_TA', 'cat__KitchenQual_Ex', 'cat__KitchenQual_Fa',
'cat__KitchenQual_Gd', 'cat__KitchenQual_TA',
'cat__Functional_Maj1', 'cat__Functional_Maj2',
'cat__Functional_Min1', 'cat__Functional_Min2',
'cat__Functional_Mod', 'cat__Functional_Sev',
'cat__Functional_Typ', 'cat__GarageFinish_Fin',
'cat__GarageFinish_NotAvailable', 'cat__GarageFinish_RFn',
'cat__GarageFinish_Unf', 'cat__GarageCond_Ex',
'cat__GarageCond_Fa', 'cat__GarageCond_Gd',
'cat__GarageCond_NotAvailable', 'cat__GarageCond_Po',
'cat__GarageCond_TA', 'cat__Fence_GdPrv', 'cat__Fence_GdWo',
'cat__Fence_MnPrv', 'cat__Fence_MnWw', 'cat__Fence_NotAvailable',
'cat__SaleCondition_Abnorml', 'cat__SaleCondition_AdjLand',
'cat__SaleCondition_Alloca', 'cat__SaleCondition_Family',
'cat__SaleCondition_Normal', 'cat__SaleCondition_Partial'],
dtype=object)
# Wrap the encoded array in a DataFrame labelled with the final feature names.
X_sel_pre = pd.DataFrame(X_sel_pre, columns=final_cols)
X_sel_pre.head()
| num__MSSubClass | num__LotArea | num__OverallQual | num__OverallCond | num__YearBuilt | num__MasVnrArea | num__BsmtFinSF1 | num__BsmtFinSF2 | num__BsmtUnfSF | num__TotalBsmtSF | ... | cat__Fence_GdWo | cat__Fence_MnPrv | cat__Fence_MnWw | cat__Fence_NotAvailable | cat__SaleCondition_Abnorml | cat__SaleCondition_AdjLand | cat__SaleCondition_Alloca | cat__SaleCondition_Family | cat__SaleCondition_Normal | cat__SaleCondition_Partial | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.073375 | -0.207142 | 0.651479 | -0.517200 | 1.050994 | 0.514104 | 0.575425 | -0.288653 | -0.944591 | -0.459303 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | -0.872563 | -0.091886 | -0.071836 | 2.179628 | 0.156734 | -0.570750 | 1.171992 | -0.288653 | -0.641228 | 0.466465 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 0.073375 | 0.073480 | 0.651479 | -0.517200 | 0.984752 | 0.325915 | 0.092907 | -0.288653 | -0.301643 | -0.313369 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 0.309859 | -0.096897 | 0.651479 | -0.517200 | -1.863632 | -0.570750 | -0.499274 | -0.288653 | -0.061670 | -0.687324 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 0.073375 | 0.375148 | 1.374795 | -0.517200 | 0.951632 | 1.366489 | 0.463568 | -0.288653 | -0.174865 | 0.199680 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
5 rows × 139 columns
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# The target frame is named `y` (lowercase); `Y` was never defined and raised
# NameError.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_sel_pre, y, test_size=0.2, random_state=42
)
xtrain.shape
(1168, 139)
xtest.shape
(292, 139)
# Baseline: ordinary least squares on the selected, one-hot encoded features.
# NOTE(review): the recorded scores are about 0.905 R2 on train but about
# -8.4e18 on test, so this unregularised fit does not generalise. Possibly
# caused by collinear or train-constant one-hot columns; confirm.
model = LinearRegression()
model.fit(xtrain, ytrain)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
model.score(xtrain, ytrain)
0.9051756618987917
model.score(xtest, ytest)
-8.38311961342074e+18
# evaluate_model comes from the project-local RegPackage module (not shown here).
# Judging by the recorded output it reports MSE, RMSE, MAE and R2 for the
# training and the testing split.
from RegPackage import evaluate_model
evaluate_model(xtrain, ytrain, xtest, ytest, model)
Training Results : MSE : 565584541.83 RMSE: 23782.02 MAE : 14554.79 R2 : 0.9052 ==================================== Testing Results : MSE : 64301262068805499080576860160.00 RMSE: 253576935206665.62 MAE : 17274158858732.48 R2 : -8383119613420739584.0000
# Candidate regularisation strengths: 0.1, 0.2, ..., 99.9 (999 values).
# np.linspace is used instead of np.arange with a float step, because with a
# non-integer step the element count and last value of arange depend on
# floating-point rounding; linspace fixes both explicitly.
params = {'alpha': np.linspace(0.1, 99.9, 999)}
params
{'alpha': array([ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1,
1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3,
3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4,
4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5,
5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6,
6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7,
7.8, 7.9, 8. , 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8,
8.9, 9. , 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9,
10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11. ,
11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12. , 12.1,
12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13. , 13.1, 13.2,
13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14. , 14.1, 14.2, 14.3,
14.4, 14.5, 14.6, 14.7, 14.8, 14.9, 15. , 15.1, 15.2, 15.3, 15.4,
15.5, 15.6, 15.7, 15.8, 15.9, 16. , 16.1, 16.2, 16.3, 16.4, 16.5,
16.6, 16.7, 16.8, 16.9, 17. , 17.1, 17.2, 17.3, 17.4, 17.5, 17.6,
17.7, 17.8, 17.9, 18. , 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7,
18.8, 18.9, 19. , 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8,
19.9, 20. , 20.1, 20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9,
21. , 21.1, 21.2, 21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22. ,
22.1, 22.2, 22.3, 22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23. , 23.1,
23.2, 23.3, 23.4, 23.5, 23.6, 23.7, 23.8, 23.9, 24. , 24.1, 24.2,
24.3, 24.4, 24.5, 24.6, 24.7, 24.8, 24.9, 25. , 25.1, 25.2, 25.3,
25.4, 25.5, 25.6, 25.7, 25.8, 25.9, 26. , 26.1, 26.2, 26.3, 26.4,
26.5, 26.6, 26.7, 26.8, 26.9, 27. , 27.1, 27.2, 27.3, 27.4, 27.5,
27.6, 27.7, 27.8, 27.9, 28. , 28.1, 28.2, 28.3, 28.4, 28.5, 28.6,
28.7, 28.8, 28.9, 29. , 29.1, 29.2, 29.3, 29.4, 29.5, 29.6, 29.7,
29.8, 29.9, 30. , 30.1, 30.2, 30.3, 30.4, 30.5, 30.6, 30.7, 30.8,
30.9, 31. , 31.1, 31.2, 31.3, 31.4, 31.5, 31.6, 31.7, 31.8, 31.9,
32. , 32.1, 32.2, 32.3, 32.4, 32.5, 32.6, 32.7, 32.8, 32.9, 33. ,
33.1, 33.2, 33.3, 33.4, 33.5, 33.6, 33.7, 33.8, 33.9, 34. , 34.1,
34.2, 34.3, 34.4, 34.5, 34.6, 34.7, 34.8, 34.9, 35. , 35.1, 35.2,
35.3, 35.4, 35.5, 35.6, 35.7, 35.8, 35.9, 36. , 36.1, 36.2, 36.3,
36.4, 36.5, 36.6, 36.7, 36.8, 36.9, 37. , 37.1, 37.2, 37.3, 37.4,
37.5, 37.6, 37.7, 37.8, 37.9, 38. , 38.1, 38.2, 38.3, 38.4, 38.5,
38.6, 38.7, 38.8, 38.9, 39. , 39.1, 39.2, 39.3, 39.4, 39.5, 39.6,
39.7, 39.8, 39.9, 40. , 40.1, 40.2, 40.3, 40.4, 40.5, 40.6, 40.7,
40.8, 40.9, 41. , 41.1, 41.2, 41.3, 41.4, 41.5, 41.6, 41.7, 41.8,
41.9, 42. , 42.1, 42.2, 42.3, 42.4, 42.5, 42.6, 42.7, 42.8, 42.9,
43. , 43.1, 43.2, 43.3, 43.4, 43.5, 43.6, 43.7, 43.8, 43.9, 44. ,
44.1, 44.2, 44.3, 44.4, 44.5, 44.6, 44.7, 44.8, 44.9, 45. , 45.1,
45.2, 45.3, 45.4, 45.5, 45.6, 45.7, 45.8, 45.9, 46. , 46.1, 46.2,
46.3, 46.4, 46.5, 46.6, 46.7, 46.8, 46.9, 47. , 47.1, 47.2, 47.3,
47.4, 47.5, 47.6, 47.7, 47.8, 47.9, 48. , 48.1, 48.2, 48.3, 48.4,
48.5, 48.6, 48.7, 48.8, 48.9, 49. , 49.1, 49.2, 49.3, 49.4, 49.5,
49.6, 49.7, 49.8, 49.9, 50. , 50.1, 50.2, 50.3, 50.4, 50.5, 50.6,
50.7, 50.8, 50.9, 51. , 51.1, 51.2, 51.3, 51.4, 51.5, 51.6, 51.7,
51.8, 51.9, 52. , 52.1, 52.2, 52.3, 52.4, 52.5, 52.6, 52.7, 52.8,
52.9, 53. , 53.1, 53.2, 53.3, 53.4, 53.5, 53.6, 53.7, 53.8, 53.9,
54. , 54.1, 54.2, 54.3, 54.4, 54.5, 54.6, 54.7, 54.8, 54.9, 55. ,
55.1, 55.2, 55.3, 55.4, 55.5, 55.6, 55.7, 55.8, 55.9, 56. , 56.1,
56.2, 56.3, 56.4, 56.5, 56.6, 56.7, 56.8, 56.9, 57. , 57.1, 57.2,
57.3, 57.4, 57.5, 57.6, 57.7, 57.8, 57.9, 58. , 58.1, 58.2, 58.3,
58.4, 58.5, 58.6, 58.7, 58.8, 58.9, 59. , 59.1, 59.2, 59.3, 59.4,
59.5, 59.6, 59.7, 59.8, 59.9, 60. , 60.1, 60.2, 60.3, 60.4, 60.5,
60.6, 60.7, 60.8, 60.9, 61. , 61.1, 61.2, 61.3, 61.4, 61.5, 61.6,
61.7, 61.8, 61.9, 62. , 62.1, 62.2, 62.3, 62.4, 62.5, 62.6, 62.7,
62.8, 62.9, 63. , 63.1, 63.2, 63.3, 63.4, 63.5, 63.6, 63.7, 63.8,
63.9, 64. , 64.1, 64.2, 64.3, 64.4, 64.5, 64.6, 64.7, 64.8, 64.9,
65. , 65.1, 65.2, 65.3, 65.4, 65.5, 65.6, 65.7, 65.8, 65.9, 66. ,
66.1, 66.2, 66.3, 66.4, 66.5, 66.6, 66.7, 66.8, 66.9, 67. , 67.1,
67.2, 67.3, 67.4, 67.5, 67.6, 67.7, 67.8, 67.9, 68. , 68.1, 68.2,
68.3, 68.4, 68.5, 68.6, 68.7, 68.8, 68.9, 69. , 69.1, 69.2, 69.3,
69.4, 69.5, 69.6, 69.7, 69.8, 69.9, 70. , 70.1, 70.2, 70.3, 70.4,
70.5, 70.6, 70.7, 70.8, 70.9, 71. , 71.1, 71.2, 71.3, 71.4, 71.5,
71.6, 71.7, 71.8, 71.9, 72. , 72.1, 72.2, 72.3, 72.4, 72.5, 72.6,
72.7, 72.8, 72.9, 73. , 73.1, 73.2, 73.3, 73.4, 73.5, 73.6, 73.7,
73.8, 73.9, 74. , 74.1, 74.2, 74.3, 74.4, 74.5, 74.6, 74.7, 74.8,
74.9, 75. , 75.1, 75.2, 75.3, 75.4, 75.5, 75.6, 75.7, 75.8, 75.9,
76. , 76.1, 76.2, 76.3, 76.4, 76.5, 76.6, 76.7, 76.8, 76.9, 77. ,
77.1, 77.2, 77.3, 77.4, 77.5, 77.6, 77.7, 77.8, 77.9, 78. , 78.1,
78.2, 78.3, 78.4, 78.5, 78.6, 78.7, 78.8, 78.9, 79. , 79.1, 79.2,
79.3, 79.4, 79.5, 79.6, 79.7, 79.8, 79.9, 80. , 80.1, 80.2, 80.3,
80.4, 80.5, 80.6, 80.7, 80.8, 80.9, 81. , 81.1, 81.2, 81.3, 81.4,
81.5, 81.6, 81.7, 81.8, 81.9, 82. , 82.1, 82.2, 82.3, 82.4, 82.5,
82.6, 82.7, 82.8, 82.9, 83. , 83.1, 83.2, 83.3, 83.4, 83.5, 83.6,
83.7, 83.8, 83.9, 84. , 84.1, 84.2, 84.3, 84.4, 84.5, 84.6, 84.7,
84.8, 84.9, 85. , 85.1, 85.2, 85.3, 85.4, 85.5, 85.6, 85.7, 85.8,
85.9, 86. , 86.1, 86.2, 86.3, 86.4, 86.5, 86.6, 86.7, 86.8, 86.9,
87. , 87.1, 87.2, 87.3, 87.4, 87.5, 87.6, 87.7, 87.8, 87.9, 88. ,
88.1, 88.2, 88.3, 88.4, 88.5, 88.6, 88.7, 88.8, 88.9, 89. , 89.1,
89.2, 89.3, 89.4, 89.5, 89.6, 89.7, 89.8, 89.9, 90. , 90.1, 90.2,
90.3, 90.4, 90.5, 90.6, 90.7, 90.8, 90.9, 91. , 91.1, 91.2, 91.3,
91.4, 91.5, 91.6, 91.7, 91.8, 91.9, 92. , 92.1, 92.2, 92.3, 92.4,
92.5, 92.6, 92.7, 92.8, 92.9, 93. , 93.1, 93.2, 93.3, 93.4, 93.5,
93.6, 93.7, 93.8, 93.9, 94. , 94.1, 94.2, 94.3, 94.4, 94.5, 94.6,
94.7, 94.8, 94.9, 95. , 95.1, 95.2, 95.3, 95.4, 95.5, 95.6, 95.7,
95.8, 95.9, 96. , 96.1, 96.2, 96.3, 96.4, 96.5, 96.6, 96.7, 96.8,
96.9, 97. , 97.1, 97.2, 97.3, 97.4, 97.5, 97.6, 97.7, 97.8, 97.9,
98. , 98.1, 98.2, 98.3, 98.4, 98.5, 98.6, 98.7, 98.8, 98.9, 99. ,
99.1, 99.2, 99.3, 99.4, 99.5, 99.6, 99.7, 99.8, 99.9])}
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Cross-validated search over the Ridge `alpha` candidates held in `params`.
# Scoring is negated MSE, so the best candidate has the score closest to zero.
rr = Ridge()
gscv1 = GridSearchCV(
    estimator=rr,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
)
gscv1.fit(xtrain, ytrain)
GridSearchCV(cv=5, estimator=Ridge(),
param_grid={'alpha': array([ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1,
1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3,
3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4,
4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5,
5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6,
6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7....
93.6, 93.7, 93.8, 93.9, 94. , 94.1, 94.2, 94.3, 94.4, 94.5, 94.6,
94.7, 94.8, 94.9, 95. , 95.1, 95.2, 95.3, 95.4, 95.5, 95.6, 95.7,
95.8, 95.9, 96. , 96.1, 96.2, 96.3, 96.4, 96.5, 96.6, 96.7, 96.8,
96.9, 97. , 97.1, 97.2, 97.3, 97.4, 97.5, 97.6, 97.7, 97.8, 97.9,
98. , 98.1, 98.2, 98.3, 98.4, 98.5, 98.6, 98.7, 98.8, 98.9, 99. ,
99.1, 99.2, 99.3, 99.4, 99.5, 99.6, 99.7, 99.8, 99.9])},
scoring='neg_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=Ridge(),
param_grid={'alpha': array([ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1,
1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3,
3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4,
4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5,
5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6,
6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7....
93.6, 93.7, 93.8, 93.9, 94. , 94.1, 94.2, 94.3, 94.4, 94.5, 94.6,
94.7, 94.8, 94.9, 95. , 95.1, 95.2, 95.3, 95.4, 95.5, 95.6, 95.7,
95.8, 95.9, 96. , 96.1, 96.2, 96.3, 96.4, 96.5, 96.6, 96.7, 96.8,
96.9, 97. , 97.1, 97.2, 97.3, 97.4, 97.5, 97.6, 97.7, 97.8, 97.9,
98. , 98.1, 98.2, 98.3, 98.4, 98.5, 98.6, 98.7, 98.8, 98.9, 99. ,
99.1, 99.2, 99.3, 99.4, 99.5, 99.6, 99.7, 99.8, 99.9])},
scoring='neg_mean_squared_error')Ridge()
Ridge()
gscv1.best_params_
{'alpha': 1.3000000000000003}
gscv1.best_score_
-1025505431.8491377
# Ridge estimator carrying the alpha selected by the grid search
# (GridSearchCV refits it by default, since `refit` is not overridden above).
best_ridge = gscv1.best_estimator_
best_ridge
Ridge(alpha=1.3000000000000003)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Ridge(alpha=1.3000000000000003)
best_ridge.score(xtrain, ytrain)
0.8938175920123184
best_ridge.score(xtest, ytest)
0.8898912783626156
evaluate_model(xtrain, ytrain, xtest, ytest, best_ridge)
Training Results : MSE : 633330321.89 RMSE: 25166.05 MAE : 15450.20 R2 : 0.8938 ==================================== Testing Results : MSE : 844569813.22 RMSE: 29061.48 MAE : 17774.83 R2 : 0.8899
# Candidate Lasso penalties: 180.0 up to (but excluding) 200, in steps of 0.1.
# NOTE(review): the search below selects alpha=180.0, the lowest value in this
# grid -- consider extending the range downwards to confirm it is the optimum.
params2 = {
    'alpha': np.arange(180, 200, 0.1),
}
params2
{'alpha': array([180. , 180.1, 180.2, 180.3, 180.4, 180.5, 180.6, 180.7, 180.8,
180.9, 181. , 181.1, 181.2, 181.3, 181.4, 181.5, 181.6, 181.7,
181.8, 181.9, 182. , 182.1, 182.2, 182.3, 182.4, 182.5, 182.6,
182.7, 182.8, 182.9, 183. , 183.1, 183.2, 183.3, 183.4, 183.5,
183.6, 183.7, 183.8, 183.9, 184. , 184.1, 184.2, 184.3, 184.4,
184.5, 184.6, 184.7, 184.8, 184.9, 185. , 185.1, 185.2, 185.3,
185.4, 185.5, 185.6, 185.7, 185.8, 185.9, 186. , 186.1, 186.2,
186.3, 186.4, 186.5, 186.6, 186.7, 186.8, 186.9, 187. , 187.1,
187.2, 187.3, 187.4, 187.5, 187.6, 187.7, 187.8, 187.9, 188. ,
188.1, 188.2, 188.3, 188.4, 188.5, 188.6, 188.7, 188.8, 188.9,
189. , 189.1, 189.2, 189.3, 189.4, 189.5, 189.6, 189.7, 189.8,
189.9, 190. , 190.1, 190.2, 190.3, 190.4, 190.5, 190.6, 190.7,
190.8, 190.9, 191. , 191.1, 191.2, 191.3, 191.4, 191.5, 191.6,
191.7, 191.8, 191.9, 192. , 192.1, 192.2, 192.3, 192.4, 192.5,
192.6, 192.7, 192.8, 192.9, 193. , 193.1, 193.2, 193.3, 193.4,
193.5, 193.6, 193.7, 193.8, 193.9, 194. , 194.1, 194.2, 194.3,
194.4, 194.5, 194.6, 194.7, 194.8, 194.9, 195. , 195.1, 195.2,
195.3, 195.4, 195.5, 195.6, 195.7, 195.8, 195.9, 196. , 196.1,
196.2, 196.3, 196.4, 196.5, 196.6, 196.7, 196.8, 196.9, 197. ,
197.1, 197.2, 197.3, 197.4, 197.5, 197.6, 197.7, 197.8, 197.9,
198. , 198.1, 198.2, 198.3, 198.4, 198.5, 198.6, 198.7, 198.8,
198.9, 199. , 199.1, 199.2, 199.3, 199.4, 199.5, 199.6, 199.7,
199.8, 199.9])}
from sklearn.linear_model import Lasso

# 5-fold cross-validated search over the Lasso `alpha` candidates in `params2`,
# scored with negated MSE.
ls = Lasso()
gscv2 = GridSearchCV(
    estimator=ls,
    param_grid=params2,
    scoring='neg_mean_squared_error',
    cv=5,
)
gscv2.fit(xtrain, ytrain)
GridSearchCV(cv=5, estimator=Lasso(),
param_grid={'alpha': array([180. , 180.1, 180.2, 180.3, 180.4, 180.5, 180.6, 180.7, 180.8,
180.9, 181. , 181.1, 181.2, 181.3, 181.4, 181.5, 181.6, 181.7,
181.8, 181.9, 182. , 182.1, 182.2, 182.3, 182.4, 182.5, 182.6,
182.7, 182.8, 182.9, 183. , 183.1, 183.2, 183.3, 183.4, 183.5,
183.6, 183.7, 183.8, 183.9, 184. , 184.1, 184.2, 184.3, 184.4,
184.5, 184.6, 184.7, 184....
194.4, 194.5, 194.6, 194.7, 194.8, 194.9, 195. , 195.1, 195.2,
195.3, 195.4, 195.5, 195.6, 195.7, 195.8, 195.9, 196. , 196.1,
196.2, 196.3, 196.4, 196.5, 196.6, 196.7, 196.8, 196.9, 197. ,
197.1, 197.2, 197.3, 197.4, 197.5, 197.6, 197.7, 197.8, 197.9,
198. , 198.1, 198.2, 198.3, 198.4, 198.5, 198.6, 198.7, 198.8,
198.9, 199. , 199.1, 199.2, 199.3, 199.4, 199.5, 199.6, 199.7,
199.8, 199.9])},
scoring='neg_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=Lasso(),
param_grid={'alpha': array([180. , 180.1, 180.2, 180.3, 180.4, 180.5, 180.6, 180.7, 180.8,
180.9, 181. , 181.1, 181.2, 181.3, 181.4, 181.5, 181.6, 181.7,
181.8, 181.9, 182. , 182.1, 182.2, 182.3, 182.4, 182.5, 182.6,
182.7, 182.8, 182.9, 183. , 183.1, 183.2, 183.3, 183.4, 183.5,
183.6, 183.7, 183.8, 183.9, 184. , 184.1, 184.2, 184.3, 184.4,
184.5, 184.6, 184.7, 184....
194.4, 194.5, 194.6, 194.7, 194.8, 194.9, 195. , 195.1, 195.2,
195.3, 195.4, 195.5, 195.6, 195.7, 195.8, 195.9, 196. , 196.1,
196.2, 196.3, 196.4, 196.5, 196.6, 196.7, 196.8, 196.9, 197. ,
197.1, 197.2, 197.3, 197.4, 197.5, 197.6, 197.7, 197.8, 197.9,
198. , 198.1, 198.2, 198.3, 198.4, 198.5, 198.6, 198.7, 198.8,
198.9, 199. , 199.1, 199.2, 199.3, 199.4, 199.5, 199.6, 199.7,
199.8, 199.9])},
scoring='neg_mean_squared_error')Lasso()
Lasso()
gscv2.best_params_
{'alpha': 180.0}
gscv2.best_score_
-1017797737.3011879
# Lasso estimator carrying the alpha selected by the grid search
# (GridSearchCV refits it by default, since `refit` is not overridden above).
best_lasso = gscv2.best_estimator_
best_lasso
Lasso(alpha=180.0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Lasso(alpha=180.0)
best_lasso.score(xtrain, ytrain)
0.8878640283403164
best_lasso.score(xtest, ytest)
0.8884719356566722
evaluate_model(xtrain, ytrain, xtest, ytest, best_lasso)
Training Results : MSE : 668840652.35 RMSE: 25861.95 MAE : 15751.48 R2 : 0.8879 ==================================== Testing Results : MSE : 855456634.77 RMSE: 29248.19 MAE : 17315.94 R2 : 0.8885
# Load the out-of-sample data that predictions will be generated for
xnew = pd.read_csv('sample_set.csv')
xnew.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1461 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | ... | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
| 1 | 1462 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
| 2 | 1463 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
| 3 | 1464 | 60 | RL | 78.0 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal |
| 4 | 1465 | 120 | RL | 43.0 | 5005 | Pave | NaN | IR1 | HLS | AllPub | ... | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal |
5 rows × 80 columns
# Count missing values per column, then show only the columns that have any
s = xnew.isna().sum()
s[s>0]
MSZoning 4 LotFrontage 227 Alley 1352 Utilities 2 Exterior1st 1 Exterior2nd 1 MasVnrType 894 MasVnrArea 15 BsmtQual 44 BsmtCond 45 BsmtExposure 44 BsmtFinType1 42 BsmtFinSF1 1 BsmtFinType2 42 BsmtFinSF2 1 BsmtUnfSF 1 TotalBsmtSF 1 BsmtFullBath 2 BsmtHalfBath 2 KitchenQual 1 Functional 2 FireplaceQu 730 GarageType 76 GarageYrBlt 78 GarageFinish 78 GarageCars 1 GarageArea 1 GarageQual 78 GarageCond 78 PoolQC 1456 Fence 1169 MiscFeature 1408 SaleType 1 dtype: int64
# Apply pre2.transform (transform only, no fitting) to the out-of-sample data
# and convert the result to a dense array
xnew_pre = pre2.transform(xnew).toarray()
xnew_pre
array([[-0.87256276, 0.11076257, -0.79515147, ..., 0. ,
1. , 0. ],
[-0.87256276, 0.37584985, -0.07183611, ..., 0. ,
1. , 0. ],
[ 0.07337496, 0.33205282, -0.79515147, ..., 0. ,
1. , 0. ],
...,
[-0.87256276, 0.95042275, -0.79515147, ..., 0. ,
0. , 0. ],
[ 0.66458604, -0.00759964, -0.79515147, ..., 0. ,
1. , 0. ],
[ 0.07337496, -0.08918038, 0.65147924, ..., 0. ,
1. , 0. ]])
# Wrap the transformed array in a DataFrame labelled with final_cols
xnew_pre = pd.DataFrame(xnew_pre, columns=final_cols)
xnew_pre.head()
| num__MSSubClass | num__LotArea | num__OverallQual | num__OverallCond | num__YearBuilt | num__MasVnrArea | num__BsmtFinSF1 | num__BsmtFinSF2 | num__BsmtUnfSF | num__TotalBsmtSF | ... | cat__Fence_GdWo | cat__Fence_MnPrv | cat__Fence_MnWw | cat__Fence_NotAvailable | cat__SaleCondition_Abnorml | cat__SaleCondition_AdjLand | cat__SaleCondition_Alloca | cat__SaleCondition_Family | cat__SaleCondition_Normal | cat__SaleCondition_Partial | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.872563 | 0.110763 | -0.795151 | 0.381743 | -0.340077 | -0.570750 | 0.053428 | 0.604293 | -0.672923 | -0.400017 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | -0.872563 | 0.375850 | -0.071836 | 0.381743 | -0.439440 | 0.027027 | 1.051363 | -0.288653 | -0.365032 | 0.619239 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 0.073375 | 0.332053 | -0.795151 | -0.517200 | 0.852269 | -0.570750 | 0.761852 | -0.288653 | -0.974021 | -0.295127 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 0.073375 | -0.054002 | -0.071836 | 0.381743 | 0.885390 | -0.460051 | 0.347326 | -0.288653 | -0.550672 | -0.299687 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 1.492282 | -0.552407 | 1.374795 | -0.517200 | 0.686666 | -0.570750 | -0.396190 | -0.288653 | 1.018211 | 0.507509 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
5 rows × 139 columns
# Predict SalePrice for the out-of-sample rows with the tuned Lasso model
preds = best_lasso.predict(xnew_pre)
preds
array([118792.07533453, 155091.35908051, 168783.86045418, ...,
175422.71143984, 110928.46269249, 222777.99541006])
# Start the output frame from the Id column. Take an explicit copy so that
# columns added to df_final later are written to an independent DataFrame
# rather than to a selection of xnew (avoids pandas' SettingWithCopyWarning,
# which is otherwise only hidden by the global filterwarnings('ignore')).
df_final = xnew[['Id']].copy()
df_final
| Id | |
|---|---|
| 0 | 1461 |
| 1 | 1462 |
| 2 | 1463 |
| 3 | 1464 |
| 4 | 1465 |
| ... | ... |
| 1454 | 2915 |
| 1455 | 2916 |
| 1456 | 2917 |
| 1457 | 2918 |
| 1458 | 2919 |
1459 rows × 1 columns
df_final['SalePrice'] = preds
df_final
| Id | SalePrice | |
|---|---|---|
| 0 | 1461 | 118792.075335 |
| 1 | 1462 | 155091.359081 |
| 2 | 1463 | 168783.860454 |
| 3 | 1464 | 186538.491042 |
| 4 | 1465 | 216796.324181 |
| ... | ... | ... |
| 1454 | 2915 | 81657.630708 |
| 1455 | 2916 | 77637.340257 |
| 1456 | 2917 | 175422.711440 |
| 1457 | 2918 | 110928.462692 |
| 1458 | 2919 | 222777.995410 |
1459 rows × 2 columns
# Round the predicted prices to 2 decimal places using the vectorised
# Series.round instead of calling Python's round() once per element.
df_final['SalePrice'] = df_final['SalePrice'].round(2)
df_final
| Id | SalePrice | |
|---|---|---|
| 0 | 1461 | 118792.08 |
| 1 | 1462 | 155091.36 |
| 2 | 1463 | 168783.86 |
| 3 | 1464 | 186538.49 |
| 4 | 1465 | 216796.32 |
| ... | ... | ... |
| 1454 | 2915 | 81657.63 |
| 1455 | 2916 | 77637.34 |
| 1456 | 2917 | 175422.71 |
| 1457 | 2918 | 110928.46 |
| 1458 | 2919 | 222778.00 |
1459 rows × 2 columns
# Write the Id / SalePrice predictions to CSV, omitting the row index
df_final.to_csv('HouseSalespricePred.csv', index=False)